[Proposed] df50_tr,df001_tst 코드보기_인덱스안겹치게

Author

김보람

Published

February 16, 2024

imports

import pandas as pd
import numpy as np
import sklearn
import pickle 
import time 
import datetime
import warnings
warnings.filterwarnings('ignore')
%run ../function_proposed_gcn.py
with open('../fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)    
df50 = throw(fraudTrain,0.5)
df_tr, df_tst = sklearn.model_selection.train_test_split(df50)

dfn = fraudTrain[::10]
dfnn = dfn[~dfn.index.isin(df_tr.index)]
dfnn.is_fraud.mean()
0.0014052108297481207
dfnn = dfnn.reset_index(drop=True)
df_trn, df_tstn = sklearn.model_selection.train_test_split(dfnn)
df_tr.shape,df_tstn.shape
((9009, 22), (25975, 22))
df2, mask = concat(df_tr, df_tstn)
df2['index'] = df2.index
df = df2.reset_index()
df.is_fraud.mean(), df_tr.is_fraud.mean(), df_tstn.is_fraud.mean()
(0.1296878573061971, 0.4997224997224997, 0.001347449470644851)
groups = df.groupby('cc_num')
edge_index = np.array([item for sublist in (compute_time_difference(group) for _, group in groups) for item in sublist])
edge_index = edge_index.astype(np.float64)
edge_index[:,2].mean()
11358175.07907386
plt.hist(edge_index[:,2])
(array([396764., 293830., 253646., 219982., 184930., 141202., 106036.,
         71386.,  31362.,  10150.]),
 array([       0.,  3750282.,  7500564., 11250846., 15001128., 18751410.,
        22501692., 26251974., 30002256., 33752538., 37502820.]),
 <BarContainer object of 10 artists>)

theta = edge_index[:,2].mean()
edge_index[:,2] = (np.exp(-edge_index[:,2]/(theta)) != 1)*(np.exp(-edge_index[:,2]/(theta))).tolist()
gamma = 0.8
edge_index = torch.tensor([(int(row[0]), int(row[1])) for row in edge_index if row[2] > gamma], dtype=torch.long).t()
x = torch.tensor(df['amt'].values, dtype=torch.float).reshape(-1,1)
y = torch.tensor(df['is_fraud'].values,dtype=torch.int64)
data = torch_geometric.data.Data(x=x, edge_index = edge_index, y=y, train_mask = mask[0], test_mask= mask[1])
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat, yyhat_ = train_and_evaluate_model(data, model, optimizer)
yyhat_ = yyhat_.detach().numpy()
eval = evaluation(yy, yyhat, yyhat_)
eval
{'acc': 0.9131857555341675,
 'pre': 0.01486013986013986,
 'rec': 0.9714285714285714,
 'f1': 0.029272492466637965,
 'auc': 0.9741133384734002}